Lesson 4

Scatterplots

Notes: We use a scatterplot to visualize the relationship between two continuous variables.

library(ggplot2)
# Load the tab-separated pseudo-Facebook data set and draw a quick
# scatterplot of friend count against age.
pf <- read.delim("pseudo_facebook.tsv")
qplot(age, friend_count, data = pf)


What are some things that you notice right away?

Response: Younger users have a lot of friends.


ggplot Syntax

Notes: ggplot lets us specify more complicated plots.

# Same scatterplot built with ggplot(), with the x axis limited to ages 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Five-number summary (plus mean) of the age column.
summary(pf[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

Overplotting

Notes: Overplotting makes it difficult to tell how many points are in each region, so we can set the transparency of the points using the alpha parameter in geom_point() or geom_jitter().

# Jitter the points and draw them at 1/20 opacity to reduce overplotting.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5189 rows containing missing values (geom_point).

What do you notice in the plot?

Response: The friend counts for young users aren’t nearly as high as they looked before; the bulk of young users really have friend counts below 1000.


Coord_trans()

Notes: with this plot, it’s much easier to see the distribution of friend count.

# Jitter horizontally only (height = 0) and show friend count on a
# square-root-transformed y axis.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0)
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5187 rows containing missing values (geom_point).


Alpha and Jitter

Notes: we use alpha to reduce overplotting

# Friendships initiated against age: transparent, horizontally jittered
# points on a square-root y axis.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0)
  ) +
  coord_trans(y = "sqrt")


Conditional Means

Notes: Created another data frame that contains the mean and the median of friend count for each age.

library("dplyr")
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Mean, median and number of users of friend_count for each age, sorted by age.
pf.fc_by_age <-
  pf %>%
  group_by(age) %>%
  # Name the count column with `=`. Writing `n <- n()` passes an unnamed
  # assignment expression, so summarise() labels the column with the literal
  # text `n <- n()` instead of `n`.
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()) %>%
  arrange(age)

head(pf.fc_by_age, 20)
## # A tibble: 20 x 4
##      age friend_count_mean friend_count_median     n
##    <int>             <dbl>               <dbl> <int>
##  1    13              165.                74     484
##  2    14              251.               132    1925
##  3    15              348.               161    2618
##  4    16              352.               172.   3086
##  5    17              350.               156    3283
##  6    18              331.               162    5196
##  7    19              334.               157    4391
##  8    20              283.               135    3769
##  9    21              236.               121    3671
## 10    22              211.               106    3032
## 11    23              203.                93    4404
## 12    24              186.                92    2827
## 13    25              131.                62    3641
## 14    26              144.                75    2815
## 15    27              134.                72    2240
## 16    28              126.                66    2364
## 17    29              121.                66    1936
## 18    30              115.                67.5  1716
## 19    31              118.                63    1694
## 20    32              114.                63    1443

Create your plot!

# Line plot of the mean friend count at each age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()


Overlaying Summaries with Raw Data

Notes: displaying multiple summaries at the same time on the plot.

# Overlay conditional summaries of friend count (mean, 10th, 50th and 90th
# percentiles per age) on the jittered raw points.
#
# The summary function is passed as `fun`, which replaced `fun.y` in
# ggplot2 3.3.0. On current ggplot2, `fun.y` given through
# geom_line(stat = "summary") is dropped as an unknown parameter, so the
# quantile lines would fall back to the default summary instead.
ggplot(aes(x = age, y = friend_count), data = pf) +
  xlim(13, 90) +
  geom_point(alpha = 0.05,
             position = position_jitter(height = 0),
             color = "orange") +
  coord_trans(y = "sqrt") +
  # Mean friend count per age (solid black line).
  geom_line(stat = "summary", fun = mean) +
  # 10th percentile (dashed blue).
  geom_line(stat = "summary", fun = quantile,
            fun.args = list(probs = .1),
            linetype = 2, color = "blue") +
  # Median (solid blue).
  geom_line(stat = "summary", fun = quantile,
            fun.args = list(probs = .5),
            color = "blue") +
  # 90th percentile (dashed blue).
  geom_line(stat = "summary", fun = quantile,
            fun.args = list(probs = .9),
            linetype = 2, color = "blue")
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5162 rows containing missing values (geom_point).

What are some of your observations of the plot?

Response: Having more than 1000 friends is rare; 90% of users have fewer than 1000 friends.


Correlation

Notes: We used the Pearson product moment correlation to measure the linear relationship between age and friend count.

# Pearson correlation test between age and friend count over all users.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response: -0.027


Correlation on Subsets

Notes: We don’t want to include the older ages in our correlation number, since older ages are likely to be incorrect.

# Correlation test between age and friend count for users aged 70 or younger.
with(
  subset(pf, subset = age <= 70),
  cor.test(x = age, y = friend_count)
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods

Notes: Correlation Methods: Pearson’s r, Spearman’s rho, and Kendall’s tau.


Create Scatterplots

Notes: We will look at the number of likes users received from friends on the desktop version of the site and compare it to the total number of likes users received.

# Total likes received against likes received through the desktop site.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point()


Strong Correlations

# Same scatterplot, with both axes cut at their 95th percentile and a red
# linear-model fit added.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

# Correlation test between desktop likes received and total likes received.
cor.test(x = pf$www_likes_received, y = pf$likes_received)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

More Caution with Correlation

library(alr3)
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Mitchell data set (made available by library(alr3)).
data(Mitchell)

Create your plot!

# Temperature against month for the Mitchell data.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point()


Noisy Scatterplots

# Correlation test between month and temperature.
cor.test(x = Mitchell$Month, y = Mitchell$Temp)
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes: We should place the x-axis breaks every 12 months, since there are 12 months in a year.

# Same scatterplot with an x-axis break every 12 months.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))


A New Perspective

What do you notice? Response: When we stretch out the graph, we notice more of a cyclical pattern, because there are seasons in Nebraska.


Understanding Noise: Age to Age Months

# Mean friend count by age in whole years, for comparison with the
# month-level version computed next.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

# Age as a fractional number of years: whole years plus (12 - dob_month) / 12.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

Age with Months Means

# Mean, median and number of users of friend_count for each distinct
# age_with_months value, sorted by age_with_months.
pf.fc_by_age_months <-
  pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

head(pf.fc_by_age_months)
## # A tibble: 6 x 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1            13.2              46.3                30.5     6
## 2            13.2             115.                 23.5    14
## 3            13.3             136.                 44      25
## 4            13.4             164.                 72      33
## 5            13.5             131.                 66      45
## 6            13.6             157.                 64      54

Noise in Conditional Means

# Mean friend count by age measured in months, for ages below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()


Smoothing Conditional Means

# p1: mean friend count by age in years (ages below 71) with a smoother.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: mean friend count by age in months (ages below 71) with a smoother.
p2 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: mean friend count with age rounded to the nearest multiple of 5
# (ages below 71).
# The summary function is passed as `fun`, which replaced `fun.y` in
# ggplot2 3.3.0. On current ggplot2, `fun.y` given through
# geom_line(stat = "summary") is dropped as an unknown parameter.
p3 <- ggplot(aes(x = round(age / 5) * 5, y = friend_count),
             data = subset(pf, age < 71)) +
  geom_line(stat = "summary", fun = mean)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the three plots in one column: months (p2), years (p1), 5-year bins (p3).
grid.arrange(grobs = list(p2, p1, p3), ncol = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'


Which Plot to Choose?

Notes: In exploratory data analysis we often create multiple visualizations and summaries of the same data, gleaning different insights from each.


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!